rm(list=ls(all=TRUE))

##################### REPLICATE MAIN BIITW ANALYSES #############################
### data file created by running BTW replication code) in Rdata format
load("Data/biitw_data_output.Rdata")
### DTA file directly generated from BIITW replication archive:
#df <- read.dta13("Data/s35_112_clean.dta")

### Table 5
#Column 1
col1 <- feols(r_deviate ~ r_order  | ID, data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 2
col2 <- feols(r_deviate ~ r_order  | interaction(ID,CONGRESS), data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other")) 
#Column 3 
col3 <- feols(r_deviate ~ r_order +r_oparty_fracdeviate_loo + r_Moparty_fracdeviate_loo | interaction(ID,CONGRESS), data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 4
col4 <- feols(r_deviate ~ r_order_overall  | ID, data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 5
col5 <- feols(r_deviate ~ r_order_overall  | interaction(ID,CONGRESS), data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other")) 
#Column 6
col6 <- feols(r_deviate ~ r_order_overall + r_oparty_fracdeviate_loo + r_Moparty_fracdeviate_loo | interaction(ID,CONGRESS), data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))

table2 <- list(col1 , col2 , col3 , col4 , col5 , col6)

### create txt file with results
sink("Generated Files/Original Table 5.txt")
column <- 1
for(i in table2){
  cat("Column ",column,"\n")
  summary(i , cluster = "CONGRESS") %>% 
    print
  cat("R2 = " , round(r2(i)[2],3),"\n\n")
  column <- column + 1
}
sink(file = NULL)

################ RECREATE DATASET AND FINDINGS FROM SOURCE ######################

da <- fread("Data/Sall_votes.csv" , stringsAsFactors = FALSE)

### BIW identified errors in 81st congress; use their corrected data of membership
da_temp <- read_excel("Data/sen81corr.xlsx")
da_temp2 <- subset(da_temp , select = c("id","cong",paste0("V",1:455))) %>%
  reshape2::melt(. , id.vars = c("id","cong") , variable.name=c("rollnumber")) %>%
  mutate(. , rollnumber=str_replace(rollnumber,"V",""))

colnames(da_temp2)[1:2] <- c("icpsr" ,"congress")

da <- join(da , da_temp2)
da <- subset(da , is.na(value))
#### dropped 81st congress votes for which member was not present

### subset to 35th to 117th
da <- subset(da , congress > 34 & congress <= 117 & chamber=="Senate")
da$roll_call <- paste0(da$congress , "_", da$rollnumber)

### merge in members' names to match up to their ICPSR code
da_temp <- read.csv("DATA/Sall_members.csv" , stringsAsFactors = FALSE)

da <- join(da , subset(da_temp , select = c("congress","icpsr","state_abbrev","party_code","bioname")) , by = c("congress","icpsr"))
### drop presidents, clean up names
da <- subset(da , state_abbrev!="USA") %>%
  mutate(. , bioname = toupper(bioname))

### cast codes are 1 (yea; 2 paired 3 announced) 6 (nay; 4 announced 5 paired) 7/8 present and 9 abstain
### seeking to copy the procedure used in the original for constructing deviation
da$yea <- NA
da$yea[da$cast_code%in%c(1,2,3)] <- 1
da$yea[da$cast_code%in%c(4,5,6)] <- 0
### original paper considers 1,2,3 as yeas
da$not_voting <- as.numeric(da$cast_code==9)

### construct vote ordering ranking, verbose here to add rank and n_members (for 
### comparison to original BIW data file later)
da <- da[order(da$roll_call , da$bioname),]
da <- da[, `:=` (rank = seq_along(bioname),
                 n_members = length(bioname)
) , by = roll_call ]
da$rank_p_overall <- (da$rank-1)/(da$n_members - 1)

### clean up party; only Ds and Rs will be included in regressions, all used for ordering
da$r_party <- "other"
da$r_party[da$party_code==100] <- "Democrat"
da$r_party[da$party_code==200] <- "Republican"
table(da$party_code)
### vote-level totals; separate close / lopsided votes, identify party position
da <- da[, `:=` (dem_yes =  sum(na.omit(yea[r_party=="Democrat"])),
                 rep_yes =  sum(na.omit(yea[r_party=="Republican"])),
                 tot_yes =  sum(na.omit(yea)),
                 dem_voters = length(na.omit(yea[r_party=="Democrat"])),
                 rep_voters = length(na.omit(yea[r_party=="Republican"])),
                 tot_voters = length(na.omit(yea))
) , by = roll_call ]

### define and identify the party position on each vote
da$party_vote <- 0
da$party_vote[da$r_party=="Democrat" & (da$dem_yes - da$yea) > (da$dem_voters - da$dem_yes - (1- da$yea))] <- 1
da$party_vote[da$r_party=="Republican" & (da$rep_yes - da$yea) > (da$rep_voters - da$rep_yes - (1- da$yea))] <- 1
da$party_vote[da$r_party=="Democrat" & (da$dem_yes - da$yea) == (da$dem_voters - da$dem_yes - (1- da$yea))] <- NA
da$party_vote[da$r_party=="Republican" & (da$rep_yes - da$yea) == (da$rep_voters - da$rep_yes - (1- da$yea))] <- NA
da$party_vote[da$r_party=="Democrat" & is.na(da$yea) & (da$dem_yes) > (da$dem_voters - da$dem_yes)] <- 1
da$party_vote[da$r_party=="Republican" & is.na(da$yea) & (da$rep_yes) > (da$rep_voters - da$rep_yes)] <- 1
#subset(da , cast_code%in%c(7,8)) %>% with(. , table(deviate , party_vote))

mean(is.na(da$yea))
mean(is.na(da$party_vote[!is.na(da$yea)]))
with(da , mean(is.na(party_vote) & !is.na(da$yea)))
ddply(da , .(congress) , function(x) mean(is.na(x$party_vote) & !is.na(x$yea)))
with(da , sum(is.na(party_vote) & !is.na(da$yea)))
### About 0.8%, or 27K votes, are split within party (excluding a given member's own vote). Treat them as NA as in original paper

da$deviate <- as.numeric(da$yea!=da$party_vote)
da$deviate[is.na(da$yea)] <- NA
### Voting present is deviation if your party position is support, but not oppose
### this is debatable, as voting present is akin to missing a vote, as the senate
### requires a majority of those casting votes (i.e. not voting present) to pass a bill
da$deviate[da$cast_code%in%c(7,8)] <- 1
da$deviate[da$cast_code%in%c(7,8) & da$party_vote==0] <- 0

### original paper drops other parties (including bernie) and paired votes
### I keep them in the dataset, but drop them in regressions

# Look at the number of members for each vote by congress by size of congress
senate.size <- data.frame(congress = c(35:117) ,
                          seats = c(66,66,50,52,54,68,74,74,74,76,
                                    76,76,76,76,76,76,88,88,88,90,
                                    90,90,90,90,90,92,92,96,96,96,96,
                                    96,96,96,96,96,96,96,96,96,96,96,
                                    96,96,96,96,96,96,96,96,96,rep(100,32)))
da <- join(da , senate.size)
ddply(da , .(congress) , function(x) mean(x$n_members != x$seats))
### weird - data problems from voteview
with(da , mean(n_members > seats))
### 2.1% have more members than seats in the legislature at the time; this AFTER fixing 81st congress
with(da , mean(n_members < seats))
### 22.7% have fewer members than seats

### add supermajority and close indicators from original BIW (which uses Snyder and Groseclose data)
da <- subset(df , select = c("r_superm","r_close","CONGRESS","CALL")) %>% 
  unique(.) %>% 
  setnames(. , c("r_superm","r_close","congress","rollnumber")) %>% 
  join(da , .)
### which senates had party control > of 62% of seats?
### 35, 38-43 (6), 57-61 (5), 74-77 (4), 86-90 (5), 
### 37 congress Republicans held exactly 62% S & G say 'more than 62 percent'

##with(da , table(congress, r_superm , useNA = 'always'))
### no data on supermajority requirements for 113-117

### add fraction of copartisans deviating
da <- da %>% 
  mutate(. , roll_call_party = paste0(roll_call , r_party)) %>% 
  .[order(.$roll_call_party,  .$bioname),]

da <- da[, `:=` (#oparty_fracdeviate_loo_prior = (cumsum(na.omit(deviate)) - deviate)/(seq_along(bioname)-1),
                 oparty_fracdeviate_loo = (sum(na.omit(deviate)) - deviate)/(length(na.omit(deviate))-1)) , 
         by = roll_call_party ]
da$oparty_fracdeviate_loo_M <- as.numeric(is.na(da$oparty_fracdeviate_loo))
da$oparty_fracdeviate_loo[is.na(da$oparty_fracdeviate_loo)] <- 1

#### add roll-call specific order - create new file because I will need those 
### not voting for simulations later
da2 <- subset(da , !cast_code%in%c(9))
da2 <- da2[order(da2$roll_call , da2$bioname),]
da2 <- da2[, `:=` (rank_p = (seq_along(bioname)-1)/(length(bioname)-1)) , by = roll_call ]

### Recreate original Table 2 - 



### Table 2 - re-created
#Column 1
col1 <- feols(deviate ~ rank_p  | icpsr, data = subset(da2 , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 2
col2 <- feols(deviate ~ rank_p  | interaction(icpsr,congress), data = subset(da2 , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 3
col3 <- feols(deviate ~ rank_p + oparty_fracdeviate_loo + oparty_fracdeviate_loo_M  | interaction(icpsr,congress), data = subset(da2 , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 4
col4 <- feols(deviate ~ rank_p_overall  | icpsr, data = subset(da , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 5
col5 <- feols(deviate ~ rank_p_overall  | interaction(icpsr,congress), data = subset(da , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 6
col6 <- feols(deviate ~ rank_p_overall + oparty_fracdeviate_loo + oparty_fracdeviate_loo_M  | interaction(icpsr,congress), data = subset(da2 , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))


table2 <- list(col1 , col2 , col3 , col4 , col5 , col6)

### create txt file with results
sink("Generated Files/Replicated Table 5.txt")
column <- 1
for(i in table2){
  cat("Column ",c(1,2,3,4,5,6)[column],"\n")
  summary(i , cluster = "congress") %>% 
    print
  cat("R2 = " , round(r2(i)[2],3),"\n\n")
  column <- column + 1
}
sink(file = NULL)


#### save data files for extensions
save(da , da2 ,
     file = "Data/Temp Files/BIW_replicated_datasets.RData")


load("Data/Temp Files/BIW_replicated_datasets.RData")
### Table 7 - re-created
#Column 1
col1 <- feols(deviate ~ rank_p  | icpsr, data = subset(da2 , congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 2
col2 <- feols(deviate ~ rank_p  | icpsr, data = subset(da2 , r_close==0 & congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))
#Column 3
col3 <- feols(deviate ~ rank_p  | icpsr, data = subset(da2 , r_close==1 & congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other"))



#Column 1
col1 <- feols(r_deviate ~ r_order  | ID, data = subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 2
col2 <- feols(r_deviate ~ r_order  | ID, data = subset(df , r_close==0 & r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 3
col3 <- feols(r_deviate ~ r_order  | ID, data = subset(df , r_close==1 & r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
summary(col1 , cluster = "CONGRESS")
summary(col2 , cluster = "CONGRESS")
summary(col3 , cluster = "CONGRESS")

subset(df , CONGRESS%in%c(35,38:43,57:61,74:77,86:90) & 
         r_yeas_fraction <=0.7 & r_yeas_fraction >= 0.3 & r_superm==0) %>% with(. , table(r_close))
### oops - 90,196 votes incorrectly defined as lopsided when SG would have it as close)

df2 <- df
df2$r_close[df2$CONGRESS%in%c(35,38:43,57:61,74:77,86:90) & 
              df2$r_yeas_fraction <=0.7 & df2$r_yeas_fraction >= 0.3 & 
              df2$r_superm==0] <- 1

#Column 1
col1 <- feols(r_deviate ~ r_order  | ID, data = subset(df2 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 2
col2 <- feols(r_deviate ~ r_order  | ID, data = subset(df2 , r_close==0 & r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
#Column 3
col3 <- feols(r_deviate ~ r_order  | ID, data = subset(df2 , r_close==1 & r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other"))
summary(col1 , cluster = "CONGRESS")
summary(col2 , cluster = "CONGRESS")
summary(col3 , cluster = "CONGRESS")

feols(r_deviate ~ r_order + r_oparty_fracdeviate_loo + r_Moparty_fracdeviate_loo  | ID_CONGRESS, data = subset(df2 , r_close==1 & r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other")) %>% 
summary(. , cluster = "CONGRESS")

subset(df2 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other") %>% 
  with(. , hist(r_oparty_fracdeviate_loo))



#####################################################################
############# Results slightly, slightly differ - why? ##############
#####################################################################



df2 <- subset(df , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other")
dim(df)
df2 <- subset(df2 , select = c("ID","CONGRESS","CALL","lname","fname","r_party","r_paired","r_abstain","r_deviate","r_rank_overall","r_order","r_order_overall",
                               "r_voters","r_yea","r_oparty_line","r_Mdeviate","r_Morder","r_paired","r_members_tot"))
df2 <- mutate(df2, roll_call = paste0(CONGRESS,"_",CALL) ,
              icpsr = ID ,
              congress = CONGRESS)

### replication has 374 more observations than original - why?
da_missing <- join(da , df2 , by = c("congress","roll_call","icpsr","r_party")) %>%
  subset(. , is.na(lname) & congress < 113 & !cast_code%in%c(2,3,4,5,9) & r_party!="other" & !is.na(deviate))
dim(da_missing)
### It's actually 1,161 observations in replication that aren't in original

apply(da_missing , 2, function(x) sum(is.na(x)))

da_missing %>% with(. , table(r_Mdeviate , useNA = 'always'))
da_missing %>% with(. , table(cast_code , useNA = 'always'))
da_missing %>% with(. , table(congress , useNA = 'always'))

da_missing %>% with(. , table(congress ,cast_code , useNA = 'always'))
### OK - VoteView corrected 785 entries from 45th congress. These pertain to actual votes. 
### It appears VoteView changed pairs to actual votes.
### BIW changed the 112th votes themselves in the code, which I did not replicate
### as it wasn't commented.

### now let's look at present voters
temp <- subset(da_missing , cast_code%in%c(7,8)) %>%
  mutate(. , matchingid = paste0(roll_call,"_",icpsr)) %>%
  join(mutate(df , matchingid = paste0(CONGRESS,"_",CALL,"_",ID)) , . , by = "matchingid") %>%
  subset(. , !is.na(roll_call))
dim(temp)
head(temp)
table(temp$r_abstain)
table(temp$r_Mdeviate)

subset(da_missing , is.na(yea)) %>% with(. , table(r_Mdeviate))
subset(df , CALL==115 & ID==5227 & CONGRESS==74)
subset(da_missing , congress==74) %>% head()
subset(df , CONGRESS==74 & ID%in%c("2575","5227","6441") & CALL==115)
### others appear to be senators listed as abstaining whom VV now lists as voting present
### ok, so book is closed on why new data has more obs than original BIW

### Looking at overlapping data, do we have the same IV and DV measures?
df3 <- join(df2 , da , by = c("congress","roll_call","icpsr","r_party"))
dim(df3)
head(df3)

### there are also 11 votes in their dataset I don't have
sum(is.na(df3$seats))
subset(df3 , is.na(seats)) %>% with(. , table(congress))
### again, 45th congress, which appears to be congress VV fixed

mean(is.na(df3$r_deviate))
sum(is.na(df3$deviate))
### why do I have some missing data?
subset(df3 , is.na(deviate)) %>% head()
subset(df3 , is.na(deviate)) %>% with(. , table(CONGRESS))
subset(da , icpsr==10268 & roll_call=="45_1")
### More corrections from 45th congress + the 2 handcoded from 112th
### Look at similarities for other votes
df3 <- subset(df3 , !is.na(deviate))
mean(df3$r_deviate==df3$deviate)
### over 99.9% of DV match in the new data - why do others not match?

subset(df3 , !is.na(deviate) & r_deviate!=deviate) %>%
  with(. , table(congress))
### again, 45th congress

df3  %>%
  with(. , mean(n_members==r_members_tot))
### this is pretty good, but it should be perfect. Perhaps it's the ones they fixed by hand
subset(df3 ,n_members!=r_members_tot) %>%
  with(. , table(congress))
### again the 45th. Also the 106th - why?

subset(df3 ,n_members!=r_members_tot & congress==106 & CALL==177) %>%
  dim()

subset(df , CONGRESS==106 & CALL==177) %>% dim()
subset(df , CONGRESS==106 & CALL==177) %>% with(. , table(r_member))
subset(df , CONGRESS==106 & CALL==177 & r_member==0)

subset(da , roll_call=="106_177") %>% dim()
subset(da , roll_call=="106_177" & icpsr%in%c(14500,49905,49904))
### I have John Chafee as a member but not voting, they have him as not a member. VV discrepancy.

### Given total members are the same, what about individual rankings within membership

subset(df3 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other" &
         !is.na(deviate) & n_members==r_members_tot) %>%
  with(. , mean(round(r_order_overall,5)==round(rank_p_overall,5)))
### this is off by substantially more - why are they not the same?
subset(df3 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other" &
         !is.na(deviate) & n_members==r_members_tot) %>%
  with(. , sum(round(r_order_overall,5)!=round(rank_p_overall,5)))

subset(df3 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other" &
         !is.na(deviate) & n_members==r_members_tot &
         round(r_order_overall,5)!=round(rank_p_overall,5)) %>%
  head()
### clearly not just the 45th congress again

subset(df3 , r_Mdeviate==0 & r_Morder==0 & r_paired==0 & r_party!="other" &
         !is.na(deviate) & n_members==r_members_tot &
         round(r_order_overall,5)!=round(rank_p_overall,5)) %>%
  with(. , table(congress))
### mostly recent congresses
data.frame(replication = subset(da , roll_call=="103_1") %>% with(. , sort(bioname)),
           biw = subset(df , CONGRESS==103 & CALL==1 & r_Morder==0) %>% with(. , sort(paste0(lname,", ",fname)))
)
subset(df , CONGRESS==103 & CALL==1 & r_Morder==0 & lname=="BRAUN")
subset(da_temp , icpsr==49303)
### Carol Mosely Braun is listed in BIW as 'braun', so her rankings
### are very off, as are everyone's between Braun and Mosely
###; I merge into the roll call data VoteView's member profile,
### which has a column "bioname" which has full name; looks like
### my rankings should be rock solid. Any other surnames off?

subset(df3 , !is.na(bioname)) %>%
  with(. , mean(substr(lname , 1 , 3)!=substr(bioname,1,3)))

subset(df3 , !is.na(bioname) &
         substr(lname , 1 , 3)!=substr(bioname,1,3)) %>%
  with(. , table(bioname))
### Lambert Lincoln also incorrect in original data; 
### original data also drops apostrophes; does that matter?
subset(da , bioname=="O'GORMAN, JAMES ALOYSIUS") %>% with(. , table(congress))
subset(da , bioname=="O'DANIEL, WILBERT LEE (PAPPY)") %>% with(. , table(congress))
subset(da , bioname=="O'CONOR, HERBERT ROMULUS") %>% with(. , table(congress))
subset(da , bioname=="O'MAHONEY, JOSEPH CHRISTOPHER") %>% with(. , table(congress))

subset(da , bioname=="O'MAHONEY, JOSEPH CHRISTOPHER" & congress==80) %>% with(. , rank) %>% head()
subset(da , roll_call=="80_100" & rank > 60 & rank < 80)
subset(da , roll_call=="82_100" & rank > 60 & rank < 80)
subset(da , roll_call=="76_100" & rank > 60 & rank < 80)

subset(da , roll_call=="62_1" & rank > 50 & rank < 60)
subset(df , CONGRESS==62 & CALL==1 & r_rank_overall > 50 & r_rank_overall < 60) %>%
  with(. , .[order(r_rank_overall),])
### doesn't look like the apostrophes matter.
### in conclusion - they fixed some ranking errors in voteview data,
### voteview fixed many others, but some minordata errors remain.
### nevertheless, replication results are very close to original.



#####################################################################
############# How big are estimated vote order effects? #############
#####################################################################

deviation_by_senator <- subset(da , !cast_code%in%c(2,3,4,5,9) & r_party!="other") %>% 
  ddply(. , .(congress , r_party , bioname) , function(x) mean(na.omit(x$deviate)))
tail(deviation_by_senator)      

ddply(deviation_by_senator , .(congress , r_party) , function(x) range(x$V1))      
subset(deviation_by_senator , congress==117) %>% with(. , .[order(r_party , V1),])
subset(deviation_by_senator , congress==116) %>% with(. , .[order(r_party , V1),])

subset(deviation_by_senator , congress>=116) %>% ddply(. , .(r_party,congress), function(x) mean(x$V1))

ddply(deviation_by_senator , .(congress , r_party) , function(x) mean(x$V1))      
mean(deviation_by_senator$V1)
ddply(deviation_by_senator , .(congress) , function(x) mean(x$V1))      

subset(da , !cast_code%in%c(2,3,4,5,9) & r_party!="other") %>% 
  ddply(. , .(congress) , function(x) mean(na.omit(x$deviate))) %>% 
  subset(. , congress >=93) %>% with(. , mean(V1))
